{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import csv\n",
    "import json\n",
    "import os\n",
    "from io import StringIO\n",
    "import io\n",
    "import unicodedata\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_text_data(path, cols, csv_path):\n",
    "    w = csv.writer(open(csv_path,'w'))\n",
    "    w.writerow(cols)\n",
    "    l_files = os.listdir(path)\n",
    "    for l in l_files:\n",
    "        fpath = os.path.join(path, l)\n",
    "        with open(fpath) as file:\n",
    "            w.writerow([l, file.read()])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = os.path.join(os.getcwd(),'ETS_Corpus_of_Non-Native_Written_English/data/text/responses/original')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "directory = 'data'\n",
    "if not os.path.exists(directory):\n",
    "    os.makedirs(directory)\n",
    "    \n",
    "directory = 'data/TOEFL'\n",
    "if not os.path.exists(directory):\n",
    "    os.makedirs(directory)\n",
    "    \n",
    "data_path = os.path.join(directory, 'textdata.csv')\n",
    "get_text_data(path, ['Filename', 'text'], data_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "text_pd = pd.read_csv(data_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "path_csv = os.path.join(os.getcwd(),'ETS_Corpus_of_Non-Native_Written_English/data/text')\n",
    "l_csv = ['train', 'dev', 'test']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "l_csvfiles = os.listdir(path_csv)\n",
    "for i in l_csvfiles:\n",
    "    if l_csv[0] in i:\n",
    "        pd_ = pd.read_csv(os.path.join(path_csv,i), header = None, names = ['Filename', 'prompt', 'lang', 'score'])\n",
    "        df_train = pd_.merge(text_pd ,on='Filename', how = 'inner')\n",
    "    if l_csv[1] in i:\n",
    "        pd_ = pd.read_csv(os.path.join(path_csv,i), header = None, names = ['Filename', 'prompt', 'lang', 'score'])\n",
    "        df_dev = pd_.merge(text_pd ,on='Filename', how = 'inner')\n",
    "    if l_csv[2] in i:\n",
    "        pd_ = pd.read_csv(os.path.join(path_csv,i), header = None, names = ['Filename', 'prompt', 'lang', 'score'])\n",
    "        df_test = pd_.merge(text_pd ,on='Filename', how = 'inner')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean(t_):\n",
    "    t_ = re.sub('\\s+',' ',t_)\n",
    "    t_ = re.sub('- ','',t_)\n",
    "    #url_reg  = r'[a-z]*[:.]+\\S+'\n",
    "    #t_ = re.sub(url_reg, '', t_)\n",
    "    t_ = re.sub('([.,!?()])', r' \\1 ', t_)\n",
    "    t_ = re.sub('\\\"', ' \\\" ',t_)\n",
    "    t_ = re.sub('$', ' $ ',t_)\n",
    "    t_ = re.sub(r'\\'s', ' \\'s', t_)\n",
    "    t_ = re.sub(r'\\'re', ' \\'re', t_)\n",
    "    t_ = re.sub(r'\\'ll', ' \\'ll', t_)\n",
    "    t_ = re.sub(r'\\'m', ' \\'m', t_)\n",
    "    t_ = re.sub(r'\\'d', ' \\'d', t_)\n",
    "    t_ = re.sub(r'can\\'t', 'can n\\'t', t_)\n",
    "    t_ = re.sub(r'n\\'t', ' n\\'t', t_)\n",
    "    t_ = re.sub(r'sn\\'t', 's n\\'t', t_)\n",
    "    t_ = re.sub('\\s{2,}', ' ', t_)\n",
    "    t_ = t_.lower()\n",
    "    mydict = us_gb_dict()\n",
    "    t_ = replace_all(t_, mydict)\n",
    "    return(t_)\n",
    "\n",
    "def clean_par(t_):\n",
    "    #t_ = re.sub('\\s+',' ',t_)\n",
    "    t_ = re.sub('- ','',t_)\n",
    "    #url_reg  = r'[a-z]*[:.]+\\S+'\n",
    "    #t_ = re.sub(url_reg, '', t_)\n",
    "    t_ = re.sub('([.,!?()])', r' \\1 ', t_)\n",
    "    t_ = re.sub('\\\"', ' \\\" ',t_)\n",
    "    t_ = re.sub('$', ' $ ',t_)\n",
    "    t_ = re.sub(r'\\'s', ' \\'s', t_)\n",
    "    t_ = re.sub(r'\\'re', ' \\'re', t_)\n",
    "    t_ = re.sub(r'\\'ll', ' \\'ll', t_)\n",
    "    t_ = re.sub(r'\\'m', ' \\'m', t_)\n",
    "    t_ = re.sub(r'\\'d', ' \\'d', t_)\n",
    "    t_ = re.sub(r'can\\'t', 'can n\\'t', t_)\n",
    "    t_ = re.sub(r'n\\'t', ' n\\'t', t_)\n",
    "    t_ = re.sub(r'sn\\'t', 's n\\'t', t_)\n",
    "    #t_ = re.sub('\\s{2,}', ' ', t_)\n",
    "    t_ = t_.lower()\n",
    "    mydict = us_gb_dict()\n",
    "    t_ = replace_all(t_, mydict)\n",
    "    return(t_)\n",
    "\n",
    "\n",
    "def us_gb_dict():    \n",
    "    filepath = 'us_gb.txt'\n",
    "    with open(filepath, 'r') as fp:  \n",
    "        read = fp.read()\n",
    "    us = []\n",
    "    gb = []\n",
    "    gb_f = True\n",
    "\n",
    "    for i in read.splitlines():\n",
    "        line = i.strip()\n",
    "        #print(line)\n",
    "        if line == \"US\":\n",
    "            gb_f = False      \n",
    "        elif gb_f == True:\n",
    "            gb.append(line)\n",
    "        else:\n",
    "            us.append(line)\n",
    "    us2gb = dict(zip(gb, us))\n",
    "    return us2gb\n",
    "\n",
    "\n",
    "def replace_all(text, mydict):    \n",
    "    for gb, us in mydict.items():\n",
    "        text = text.replace(gb, us)\n",
    "    return text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def one_hot_score(df, col = 'score'):\n",
    "    s = []\n",
    "    for i in df[col]:\n",
    "        if str(i) == 'low':\n",
    "            s.append(1)\n",
    "        if str(i) == 'medium':\n",
    "            s.append(2)\n",
    "        if str(i) == 'high':\n",
    "            s.append(3)\n",
    "    \n",
    "    df['score_onehot'] = s\n",
    "    return df\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_text(df, col = 'text'):\n",
    "    t = []\n",
    "    t_par = []\n",
    "    for i in df[col]:\n",
    "        t.append(clean(i))\n",
    "        t_par.append(clean_par(i))\n",
    "    df['text1'] = t\n",
    "    df['text_par'] = t_par\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_test['score'] = df_test['lang']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_test = clean_text(df_test)\n",
    "df_test = one_hot_score(df_test)\n",
    "df_test.to_csv(os.path.join(directory,'test.csv'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = clean_text(df_train)\n",
    "df_train = one_hot_score(df_train)\n",
    "df_train.to_csv(os.path.join(directory,'train.csv'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_dev = clean_text(df_dev)\n",
    "df_dev = one_hot_score(df_dev)\n",
    "df_dev.to_csv(os.path.join(directory,'dev.csv'))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.remove(data_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
